import re

import scrapy

# Output sink for scraped poems, opened in append mode for the lifetime of the
# process and shared by the spider below. NOTE(review): the handle is never
# explicitly closed (relies on interpreter shutdown to flush), and the name
# `file` shadowed a builtin in Python 2 — consider renaming and closing via a
# Scrapy pipeline / spider_closed signal instead.
file = open('gs_no_title.txt', 'a+', encoding='utf-8')

class Gs(scrapy.Spider):
    """Spider for so.gushiwen.cn poem listings.

    Starting from the /shiwens/ index, it follows every category link in the
    right-hand "type2" panel, then extracts each poem's sentences from the
    listing pages and appends them to the module-level output file, following
    the "more" pagination link until exhausted.
    """

    name = 'gs'
    allowed_domains = ['so.gushiwen.cn']
    start_urls = ['https://so.gushiwen.cn/shiwens/']

    def parse(self, response):
        """Yield a request for every category link in the type panel."""
        for sel in response.xpath('//div[@id="type2"]/div[@class="sright"]/a'):
            url = response.urljoin(sel.xpath('@href').extract_first())
            yield scrapy.Request(url, callback=self.parse_content)

    def parse_content(self, response):
        """Extract each poem block on a listing page and write it out.

        Each full sentence (terminated by '。') becomes one line in the
        output file; a blank line separates poems. Follows pagination via
        the 'amore' link.
        """
        for sel in response.xpath('//div[@id="leftZhankai"]/div[@class="sons"]'):
            title = sel.xpath('div[@class="cont"]/p/a/b/text()').extract()
            author = sel.xpath('div[@class="cont"]/p[@class="source"]/a/text()').extract_first()
            content = sel.xpath('div[@class="cont"]/div[@class="contson"]')
            # Some poems wrap each verse in <p> tags, others are bare text
            # nodes directly under the contson div.
            if content.xpath('p'):
                content = content.xpath('p/text()').re(r'\s?(\S+?。)')
            else:
                content = content.xpath('text()').re(r'\s?(\S+?。)')
            # One extracted sentence per output line.
            content = [sentence + '\n' for sentence in content]
            # Bug fix: the original assigned title[0] unconditionally, which
            # raised IndexError for title-less blocks (hence the output file
            # name "gs_no_title.txt") and TypeError when author was None,
            # aborting the whole callback. Guard both cases.
            if title:
                title[0] = '\n' + title[0] + ' - ' + (author or '') + '\n'
            # file.writelines(title)
            file.writelines(content)
            file.write('\n')

        next_page = response.xpath('//a[@class="amore"]/@href').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse_content)
